## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape
## This is MicrobeR 0.3.2. See https://jbisanz.github.io/MicrobeR/ for usage information.
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:ape':
## 
##     rotate
## The following object is masked from 'package:MicrobeR':
## 
##     mean_sd
SVtab<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_table.qza")$data %>% as.data.frame() 

SVseq<-read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_sequences.qza")$data %>% as.data.frame() %>% rename(c("SV"="x"))

taxonomy<-read.delim("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_d2taxonomy.txt", header=T)

lookup<-(SVseq %>% rownames_to_column("ASV")) %>%
  left_join(taxonomy, by="ASV") %>%
  column_to_rownames("ASV")
## Warning: Column `ASV` joining character vector and factor, coercing into
## character vector
#Here I am parsing out the different experiments. I use "IDEOmeta4" as the major variable for the bulk of the analysis
IDEOmeta <- read.csv("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_mouse_metadata.csv", header = T)
IDEOmeta <- filter(IDEOmeta, !is.na(SequencingID))
IDEOmeta<- filter(IDEOmeta, Organism=="Mouse")
rownames(IDEOmeta) <- IDEOmeta$SequencingID
IDEOmeta1 <- filter(IDEOmeta, Experiment == "IDEO1")
rownames(IDEOmeta1) <- IDEOmeta1$SequencingID
IDEOmeta2 <- filter(IDEOmeta, Experiment == "IDEO2")
rownames(IDEOmeta2) <- IDEOmeta2$SequencingID
IDEOmeta4 <- IDEOmeta
rownames(IDEOmeta4) <- IDEOmeta4$SequencingID

IDEO124SVtab <- SVtab[,rownames(IDEOmeta)]
IDEO1SVtab <- SVtab[,rownames(IDEOmeta1)]
IDEO2SVtab <- SVtab[,rownames(IDEOmeta2)]
IDEO4SVtab <- SVtab[,rownames(IDEOmeta4)]

#SV tree
SVtree_denovo <- read_qza("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/data/gnoto_updated_pipeline_outputs/ASV_denovotree.qza")

# SVtree_denovo$data<-SVtree_denovo$data %>%
#   ape::drop.tip(., taxonomy %>% filter(is.na(Phylum)) %>% pull(ASV)) %>%
#   ape::drop.tip(., "80e52422048b1f819347739465288380")
#Setting the color palette choice and statistical comparisons for the ggpubr package.
Whitecolor="#E69F00"
Chinesecolor="#0072B2"
statgroups <- list(c("Chinese", "White"))

# theme for pcoas
theme_pcoa<- function () { 
  theme_classic(base_size=10, base_family="Helvetica") +
  theme(axis.text = element_text(size=8, color = "black"), 
        axis.title = element_text(size=10, color="black"), 
        legend.text = element_text(size=8, color = "black"), 
        legend.title = element_text(size=10, color = "black"), 
        plot.title = element_text(size=10, color="black")) +
  theme(panel.border = element_rect(color="black", size=1, fill=NA))
}

#IDEO4 QC analysis

#Lookinat at the number of sequences per sample
histogram(colSums(IDEO4SVtab))

#Removing 0 sum rows; eliminates around half of the ASVs
IDEO4SVtab <- IDEO4SVtab[rowSums(IDEO124SVtab)>0,]
#Checking tree tips
plot(SVtree_denovo$data)

#SVtree_denovo
IDEO4tree <- drop.tip(SVtree_denovo$data,SVtree_denovo$data$tip.label[!SVtree_denovo$data$tip.label %in% rownames(IDEO4SVtab)])
#IDEO4tree
IDEO4trees <- list()
IDEO4trees$Raw <- IDEO4tree

#Diversity

#Filter and process ASV table for one version of comparing between donor and recipients
ASVs <- list()
ASVs$Raw <- IDEO4SVtab
ASVs$Filtered <- Confidence.Filter(ASVs$Raw, 3, 10, TRUE)
## Filtering features such that they are present in at least 3 samples with a total of at least 10 reads.
## ...There are 1450908 reads and 698  features
## ...After filtering there are 1414695 reads and 389 features
ASVs$CZM <- zCompositions::cmultRepl(t(ASVs$Filtered),
                                     label = 0,
                                     method = "CZM",
                                     output = "p-counts",
                                     suppress.print = TRUE) %>%
                                    t()

ASVs$CLR <- apply(log2(ASVs$CZM), 2, function(x)x-mean(x))
ASVs$Subsampled <- Subsample.Table(ASVs$Filtered, VERBOSE = T)
## Subsampling feature table to 6437 , currently has  389  taxa.
## ...sampled to 6437 reads with 388 taxa
#Beta diversity and ordination

#Drop tips for the filtered filtered and subsampled trees
IDEO4trees$Filtered <- drop.tip(IDEO4trees$Raw, tip = IDEO4trees$Raw$tip.label[!IDEO4trees$Raw$tip.label %in% rownames(ASVs$Filtered)])
IDEO4trees$Subsampled <- drop.tip(IDEO4trees$Raw, tip = IDEO4trees$Raw$tip.label[!IDEO4trees$Raw$tip.label %in% rownames(ASVs$Subsampled)])

#Create distance matrix list
DM <- list()
DM$Subsampled <- list()
DM$Filtered <- list()
DM$CZM <- list()

#Calculate the Distance Matrices and put them into the list
DM$Subsampled[["Subsampled Euclidean"]] <- dist(t(ASVs$Subsampled), method = "euclidean"); gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 4698994 251.0    7949214 424.6         NA  7949214 424.6
## Vcells 8106291  61.9   14786712 112.9      16384 12143152  92.7
DM$Subsampled[["Subsampled CLR Euclidean"]] <- dist(t(ASVs$CLR), method = "euclidean"); gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 4699120 251.0    7949214 424.6         NA  7949214 424.6
## Vcells 8107493  61.9   14786712 112.9      16384 12143152  92.7
DM$Subsampled[["Subsampled Bray Curtis"]] <- vegdist(t(Make.Proportion(ASVs$Subsampled)), method = "bray"); gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 4699244 251.0    7949214 424.6         NA  7949214 424.6
## Vcells 8108707  61.9   14786712 112.9      16384 12143152  92.7
DM$Subsampled[["Jensen-Shannon Divergence"]] <- phyloseq::distance(phyloseq(otu_table(Make.Proportion(ASVs$Subsampled), taxa_are_rows = T)), method = "jsd", parallel = T); gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 4712077 251.7    7949214 424.6         NA  7949214 424.6
## Vcells 8133342  62.1   14786712 112.9      16384 13888398 106.0
#DM$Subsampled[["weighted Unifrac"]] <- UniFrac(phyloseq(otu_table(Make.Proportion(ASVs$Subsampled), taxa_are_rows = T), phy=IDEO4trees$Subsampled), weighted = T, parallel = T); gc()
#DM$Subsampled[["Unweighted Unifrac"]] <- UniFrac(phyloseq(otu_table(Make.Proportion(ASVs$Subsampled), taxa_are_rows = T), phy=IDEO4trees$Subsampled), weighted = F, parallel = T); gc()

#Evaluate the distance distribution
allDMs <-
  lapply(names(DM$Subsampled), function(dname){
    DM$Subsampled[[dname]] %>%
      as.matrix() %>%
      as.data.frame() %>%
      rownames_to_column("Subject") %>%
      gather(-Subject, key = "Match", value=Distance) %>%
      mutate(Metric=dname)
    
  }) %>%
  do.call(bind_rows, .)

allDMs %>%
  mutate(Metric=factor(Metric, levels=c("Subsampled Euclidean", "Subsampled CLR Euclidean", "Subsampled Bray Curtis", "Jensen-Shannon Divergence", "weighted Unifrac", "Unweighted Unifrac"))) %>%
  ggplot(aes(x=Distance, color=Metric)) +
  geom_freqpoly() +
  theme_MicrobeR() +
  theme(legend.position = "none") +
  facet_wrap(~Metric, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Genearting a PhilR Transformation to match the human data Qi Yan is generating
IDEO4trees$PhilR <- IDEO4trees$Filtered
IDEO4trees$PhilR <- makeNodeLabel(IDEO4trees$PhilR, method="number", prefix = "n")
ASVs$PhilR <- t(philr(t(ASVs$CZM), IDEO4trees$PhilR, part.weights = 'enorm.x.gm.counts', ilr.weights = 'blw.sqrt'))
## Building Sequential Binary Partition from Tree...
## Building Contrast Matrix...
## Transforming the Data...
## Calculating ILR Weights...
## Warning in calculate.blw(tree, method = "sum.children"): Note: a total of 19
## tip edges with zero length have been replaced with a small pseudocount of the
## minimum non-zero edge length ( 5e-09 ).
#It is not subsampled, but putting it into the subsampled list will prevent additional modification to the code
DM$Subsampled[["PhilR Euclidean"]] <- dist(t(ASVs$PhilR), method = "euclidean") ; gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 4816330 257.3    7949214 424.6         NA  7949214 424.6
## Vcells 8378141  64.0   14786712 112.9      16384 13888398 106.0
## Subsampled Euclidean-Subsampled
## Subsampled CLR Euclidean-Subsampled
## Subsampled Bray Curtis-Subsampled
## Jensen-Shannon Divergence-Subsampled
## PhilR Euclidean-Subsampled
## Warning: Expected 2 pieces. Additional pieces discarded in 10 rows [31, 32, 33,
## 34, 35, 36, 37, 38, 39, 40].

## Warning: Expected 2 pieces. Additional pieces discarded in 10 rows [31, 32, 33,
## 34, 35, 36, 37, 38, 39, 40].
## Joining, by = "SequencingID"
## Warning: Column `SequencingID` joining character vector and factor, coercing
## into character vector
## Warning: Expected 2 pieces. Additional pieces discarded in 44 rows [133, 134,
## 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
## 151, 152, ...].
## Joining, by = "SequencingID"
## Warning: Column `SequencingID` joining character vector and factor, coercing
## into character vector

## Warning: Expected 2 pieces. Additional pieces discarded in 44 rows [133, 134,
## 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
## 151, 152, ...].
#Adonis for significance

adonis(DM$Subsampled$`PhilR Euclidean`~Diet*Ethnicity, data = IDEOmeta4, permutations = 999)
## 
## Call:
## adonis(formula = DM$Subsampled$`PhilR Euclidean` ~ Diet * Ethnicity,      data = IDEOmeta4, permutations = 999) 
## 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Diet            1    2849.0 2848.95 19.3274 0.29014  0.001 ***
## Ethnicity       1     614.2  614.16  4.1665 0.06255  0.007 ** 
## Diet:Ethnicity  1     459.9  459.89  3.1199 0.04684  0.016 *  
## Residuals      40    5896.2  147.40         0.60048           
## Total          43    9819.2                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Adjusting for donor with multiple recipient mice
perm.all <- how(nperm = 999)
setBlocks(perm.all) <- with(IDEOmeta4, SampleID)
adonis(DM$Subsampled$`PhilR Euclidean`~Diet*Ethnicity, data = IDEOmeta4, permutations = perm.all)
## 
## Call:
## adonis(formula = DM$Subsampled$`PhilR Euclidean` ~ Diet * Ethnicity,      data = IDEOmeta4, permutations = perm.all) 
## 
## Blocks:  with(IDEOmeta4, SampleID) 
## Permutation: free
## Number of permutations: 999
## 
## Terms added sequentially (first to last)
## 
##                Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
## Diet            1    2849.0 2848.95 19.3274 0.29014  0.001 ***
## Ethnicity       1     614.2  614.16  4.1665 0.06255  0.001 ***
## Diet:Ethnicity  1     459.9  459.89  3.1199 0.04684  0.068 .  
## Residuals      40    5896.2  147.40         0.60048           
## Total          43    9819.2                 1.00000           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
patches + plot_annotation(tag_levels = 'A')

#ggsave("/Volumes/turnbaughlab/qb3share/qiyanang/IDEO_manuscript_Rd2/figures/gnoto/Fig_S9.pdf", height=3.5, width=5.8, useDingbats=F)